package ltd.zhaocheng.wm;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

import java.util.List;

/**
 * WebMagic {@link PageProcessor} for rental listings on sh.lianjia.com.
 *
 * <p>Every processed page is handled the same way: listing links found on the page are
 * queued, then the page is probed for a listing title. A page with a title is treated as a
 * listing detail page and its fields ({@code title}, {@code rent}, {@code type},
 * {@code info}, {@code img}) are stored in the page's result items. A page without a title
 * is marked as skipped and the paginated list URLs are queued instead.
 *
 * <p>The processor holds no mutable state; {@link #main(String[])} runs it with several
 * spider threads sharing one instance.
 *
 * <p>Created: 2020/3/21 20:07
 *
 * @author zhaocheng
 */
public class LianjiaPageProcessor implements PageProcessor {

    /** Entry URL of the rental list. */
    private static final String START_URL = "https://sh.lianjia.com/zufang/";

    /** Prefix of a paginated list URL; the page number is appended to it. */
    private static final String PAGE_URL_PREFIX = "https://sh.lianjia.com/zufang/pg";

    /** First and last list page numbers that get queued (inclusive). */
    private static final int FIRST_PAGE = 1;
    private static final int LAST_PAGE = 100;

    /** Site settings; WebMagic documents sleep time and timeout in milliseconds. */
    private static final int RETRY_TIMES = 3;
    private static final int SLEEP_TIME_MS = 1000;
    private static final int TIMEOUT_MS = 10000;

    /** Number of spider threads used by {@link #main(String[])}. */
    private static final int THREAD_COUNT = 5;

    // XPath expressions used against the downloaded page.
    private static final String XPATH_LISTING_LINKS = "//p[@class='content__list--item--title twoline']/a";
    private static final String XPATH_TITLE = "//div[@class='content clear w1150']/p/text()";
    private static final String XPATH_RENT = "//div[@class='content__aside--title']/span/text()";
    private static final String XPATH_TYPE = "//ul[@class='content__aside__list']/allText()";
    private static final String XPATH_INFO = "//div[@class='content__article__info']/allText()";
    // NOTE(review): this selects the whole <img> element, not its src attribute, so the
    // stored value is the element's markup. Left unchanged because SavePipeline may rely
    // on it — confirm before switching to ".../img/@src".
    private static final String XPATH_IMG = "//div[@class='content__article__slide__item']/img";

    /** Crawl settings; final because the instance is shared between spider threads. */
    private final Site site = Site.me()
            .setRetryTimes(RETRY_TIMES)
            .setSleepTime(SLEEP_TIME_MS)
            .setTimeOut(TIMEOUT_MS);

    /**
     * Queues listing links found on the page, then either stores the listing fields
     * (detail page) or skips the page and queues the paginated list URLs (any other page).
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();

        // Listing links; queued for later crawling.
        List<String> listingLinks = html.xpath(XPATH_LISTING_LINKS).links().all();
        page.addTargetRequests(listingLinks);

        // The title decides what kind of page this is, so it is extracted first.
        String title = html.xpath(XPATH_TITLE).toString();
        page.putField("title", title);
        System.out.println("-----------------------------------");

        if (title == null) {
            // No title: not a listing detail page. Skip it and avoid evaluating the
            // remaining detail selectors, whose results would be discarded anyway.
            page.setSkip(true);
            enqueueListPages(page);
            return;
        }

        putDetailFields(page, html);
    }

    /**
     * Stores the remaining detail fields (everything except the title) in the page's
     * result items, in the order rent, type, info, img.
     */
    private void putDetailFields(Page page, Html html) {
        page.putField("rent", html.xpath(XPATH_RENT).toString());
        page.putField("type", html.xpath(XPATH_TYPE).toString());
        page.putField("info", html.xpath(XPATH_INFO).toString());
        page.putField("img", html.xpath(XPATH_IMG).toString());
    }

    /**
     * Queues every paginated list URL from {@link #FIRST_PAGE} to {@link #LAST_PAGE}.
     *
     * <p>This runs for every non-detail page, so the same URLs are offered repeatedly;
     * presumably the spider's scheduler drops URLs it has already seen (WebMagic's default
     * scheduler de-duplicates) — verify if a custom scheduler is configured.
     */
    private void enqueueListPages(Page page) {
        for (int pageNo = FIRST_PAGE; pageNo <= LAST_PAGE; pageNo++) {
            page.addTargetRequest(PAGE_URL_PREFIX + pageNo);
        }
    }

    /**
     * Returns the crawl settings used by the spider.
     *
     * @return the site configuration built when this processor was created
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from the rental list entry URL, saving results through
     * {@code SavePipeline}, and blocks until the spider finishes.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new LianjiaPageProcessor())
                .addUrl(START_URL)
                .addPipeline(new SavePipeline())
                .thread(THREAD_COUNT)
                .run();
    }
}
